\clearpage

# Matching

A unique matching tweet is identified with the following R function call:

```{r echo = T, eval = F}

# Nearest-neighbour matching of treated tweets to control tweets; the
# distance is obtained from a generalised linear model ("glm") of `treat`
# on the account- and tweet-level covariates listed in the formula.
MatchIt::matchit(
  formula = treat ~
    user_favourites_count +
    user_followers_count +
    user_friends_count +
    user_listed_count +
    user_statuses_count +
    user_verified +
    url +
    media,
  data = this_engagement,
  method = "nearest",
  distance = "glm"
)

```

where `treat` indicates whether a tweet is

1. published by a specific group of accounts (i.e., journalists, politicians, far right or climate action accounts); or 
2. containing a link to one of a list of domains, such as facebook.com or youtube.com, or to sites identified by mediabiasfactcheck.com as publishing low-credibility news stories, high-credibility news stories or conspiracy-pseudoscience news stories.


The result is used to fit the data with

```{r echo = T, eval = F}

# Poisson regression of the unreciprocated retweet count on the treatment
# indicator, adjusting for the same covariates used in the matching step and
# weighting each observation by the `weights` column of the matched data.
glm(
  formula = rt_n_unreciprocated ~
    treat +
    user_favourites_count +
    user_followers_count +
    user_friends_count +
    user_listed_count +
    user_statuses_count +
    user_verified +
    url +
    media,
  family = 'poisson',
  data = m.data,
  weights = weights
)

```

where `rt_n_unreciprocated` is the number of retweets from accounts that had not exchanged reciprocal retweets with the account in the previous 7 days and `weights` are computed by the `matchit` function.

```{r matching-summary, include = T, fig.width = 9, fig.height = 6, fig.cap = "Distribution of regression estimates based on nearest neighbor matching.", cache = T}

library(tidyverse)
library(cowplot)

load("data/bushfire_par_summary.df.RData")
load("data/covid_par_summary.df.RData")

# Display order of the treatment categories on the x axis (labels as shown
# in the figure, i.e. after "far right" has been recoded to "far-right").
label_order <-
  c("journalist", "politician", "far-right", "#ClimateAction",
    "random 1", "random 2", "random 3",
    "Twitter link",
    "YouTube link", "Facebook link", "mainstream media link",
    "low credibility link",
    "random link 1", "random link 2", "random link 3")

# The same categories as they are spelled in the `name` column of the summary
# data frames: identical to `label_order`, except that the far-right group is
# stored as "far right". Categories not listed here (e.g. "misinformation",
# "conspiracy theory", "problematic speech") are not plotted.
raw_category_names <-
  replace(label_order, label_order == "far-right", "far right")

# Box plot of the `rt_n_unreciprocated` estimates (`est`) per treatment
# category for one case study.
#
# par_summary: a data frame with the columns `what`, `name` and `est`.
# Returns a ggplot object.
plot_estimate_distribution <- function(par_summary) {
  par_summary %>%
    dplyr::filter(what == "rt_n_unreciprocated" &
                    name %in% raw_category_names) %>%
    dplyr::mutate(name = recode(name, "far right" = "far-right"),
                  name = factor(name, levels = label_order)) %>%
    ggplot(aes(x = name, y = est)) +
    # Zoom to [-100, 100] without dropping observations outside that range,
    # so the box-plot statistics are still computed on all estimates.
    coord_cartesian(ylim = c(-100, 100)) +
    scale_y_continuous(
      trans = "pseudo_log",
      breaks = c(-100, -10, -1, 0, 1, 10, 100)
    ) +
    geom_boxplot() +
    geom_hline(yintercept = 0, colour = "red", alpha = .5) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(x = "Treatment category")
}

cowplot::plot_grid(plot_estimate_distribution(bushfire_par_summary.df),
                   plot_estimate_distribution(covid_par_summary.df),
                   ncol = 1, labels = c("Bushfires", "Covid-19"))


```

\clearpage

```{r matching-points, fig.width = 11, fig.height = 10, fig.cap = "Regression estimates based on nearest neighbor matching over time."}

# Scatter plot of the `rt_n_unreciprocated` estimates over time, one facet per
# treatment category, with a linear trend line.
#
# par_summary: a data frame with the columns `what`, `name`, `posix`, `est`
#   and `significance`.
# Returns a ggplot object.
plot_estimates_over_time <- function(par_summary) {
  par_summary %>%
    dplyr::filter(what == "rt_n_unreciprocated" &
                    !name %in% c("conspiracy theory", "government",
                                 "problematic speech", "misinformation")) %>%
    ggplot(aes(x = posix, y = est)) +
    geom_point(aes(colour = significance, alpha = significance)) +
    geom_smooth(se = FALSE, colour = "black", alpha = .5, method = "lm") +
    geom_hline(yintercept = 0, size = .5) +
    # Vertical reference line at 18 March 2020.
    geom_vline(xintercept = as.POSIXct("2020-03-18")) +
    coord_cartesian(ylim = c(-100, 100)) +
    # Default pseudo-log breaks are used on the y axis.
    scale_y_continuous(trans = "pseudo_log") +
    # Alpha values are named so that the fading of non-significant estimates
    # does not depend on the (implicit) level order of `significance`.
    # NOTE(review): labels taken from the significance table chunk; confirm
    # that both summary data frames use exactly these three labels.
    scale_alpha_manual(values = c("negative" = 1,
                                  "positive" = 1,
                                  "not significant" = 0.2)) +
    scale_colour_brewer(palette = "Set1", direction = 1) +
    facet_wrap("name") +
    labs(x = NULL, y = "rt_n_unreciprocated") +
    theme_bw()
}

cowplot::plot_grid(
  plot_estimates_over_time(bushfire_par_summary.df),
  plot_estimates_over_time(covid_par_summary.df),
  # Panel labels match the other figures ("Covid-19" rather than "Covid").
  ncol = 1, labels = c("Bushfires", "Covid-19"))

```


```{r results = "asis"}

# Share (in %) of Covid-19 estimates falling in each significance category,
# for the far-right and journalist groups and for the three random groups
# pooled together.
covid_par_summary.df %>%
  dplyr::filter(what == "rt_n_unreciprocated" &
                  name %in% c("far right", "journalist",
                              "random 1", "random 2", "random 3")) %>%
  # Pool the random groups; label the far-right group as in the figures.
  # ("far~right" would be printed with a literal tilde because kbl() escapes
  # LaTeX special characters in cell contents by default.)
  dplyr::mutate(name = case_when(grepl("random", name) ~ "random",
                                 name == "far right" ~ "far-right",
                                 name == "journalist" ~ "journalist")) %>%
  dplyr::group_by(name) %>%
  dplyr::summarize(
    not_signif = sum(significance == "not significant") / n() * 100,
    pos_signif = sum(significance == "positive") / n() * 100,
    neg_signif = sum(significance == "negative") / n() * 100
  ) %>%
  kbl(format = "latex",
      caption = "\\% of estimates in each significance category (p<.05)",
      digits = 2,
      booktabs = TRUE) %>%
  kable_styling(latex_options = c("striped"))

```

\clearpage

# Reach

```{r par_matched_diff_mean, include = T, fig.width = 9, fig.height = 9, fig.cap = "Association between degree distribution within the far-right retweet network and mean difference of retweets in comparison to matched tweets.", cache = T}

load("data/bushfire_par_matched.df.RData")
load("data/covid_par_matched.df.RData")

load("data/urls.dt.RData")

# Source ratings table; the `domain` column is reduced to a bare host name by
# stripping the scheme, a "www." prefix and every "/".
mediabiasfactcheck <- read.csv("data/mediabiasfactcheck.csv")

mediabiasfactcheck$domain <- gsub(
  pattern = "http(s)?://|www\\.|/",
  replacement = "",
  x = mediabiasfactcheck$domain
)


## Bushfire

# One row per matched pair (`time`, `subclass`): the `rt_n` of the tweet with
# `treat == 0` and of the tweet with `treat == 1` side by side (columns `0`
# and `1`), their difference `diff`, and the id and author of the
# `treat == 1` tweet.
bushfire_par_matched_wide.df <-
  tidyr::pivot_wider(
    bushfire_par_matched.df,
    id_cols = c(time, subclass),
    names_from = "treat",
    values_from = "rt_n"
  ) %>%
  dplyr::mutate(diff = `1` - `0`) %>%
  # Id of the treated tweet of each pair ...
  dplyr::left_join(
    dplyr::select(
      dplyr::filter(bushfire_par_matched.df, treat == 1),
      time, tweet_id, subclass
    ),
    by = c("time", "subclass")
  ) %>%
  # ... and the account that published it.
  dplyr::left_join(
    dplyr::select(bushfire_tweets.dt, tweet_id, user_id),
    by = "tweet_id"
  )

# bushfire_par_matched_wide.df %>%
#   dplyr::group_by(user_id) %>%
#   dplyr::summarise(mean = mean(diff),
#                    median = median(diff),
#                    p05 = quantile(diff, prob = .05),
#                    p95 = quantile(diff, prob = .95),
#                    p25 = quantile(diff, prob = .25),
#                    p75 = quantile(diff, prob = .75)) %>%
#   dplyr::ungroup() %>%
#   dplyr::mutate(user_id = factor(user_id, levels = user_id[order(mean, decreasing = T)]),
#                 class = case_when(p25 <= 0 & p75 >= 0 ~ "Null",
#                                   p25 >= 0 & p75 > 0 ~ "Overperforming",
#                                   p25 < 0 & p75 <= 0 ~ "Underperforming")) %>%
#   ggplot(aes(y = mean, x = user_id)) +
#   geom_errorbar(aes(ymin = p05, ymax = p95)) +
#   geom_point(size = 1, aes(colour = class)) +
#   theme(axis.text.x = element_blank(),
#         axis.ticks.x =  element_blank(),
#         panel.grid.major.x = element_blank()) +
#   coord_cartesian(ylim = c(-50, 50))

# Per-account summary of `diff` (treated minus matched control): mean, median
# and the 5th, 95th, 25th and 75th percentiles.
# `probs` is spelled out in full; the previous `prob =` only worked through
# partial argument matching.
bushfire_par_matched_diff_mean.df <-
  bushfire_par_matched_wide.df %>%
  dplyr::group_by(user_id) %>%
  dplyr::summarise(mean = mean(diff),
                   median = median(diff),
                   p05 = quantile(diff, probs = .05),
                   p95 = quantile(diff, probs = .95),
                   p25 = quantile(diff, probs = .25),
                   p75 = quantile(diff, probs = .75))



# Retweet network among far-right accounts: one edge per retweet, pointing
# from the retweeting account (`user_id`) to the retweeted account
# (`retweeted_status_user_id`), restricted to pairs where both accounts are in
# `farright_user_ids`. The vertex set is every far-right account that appears
# in the data set either as an author or as a retweeted account, so accounts
# without any edge are kept as isolated vertices.
bushfire_farright_retweet.g <-
  igraph::graph_from_data_frame(
    d = bushfire_tweets.dt %>%
      dplyr::select(user_id, retweeted_status_user_id) %>%
      dplyr::filter(!is.na(retweeted_status_user_id),
                    user_id %in% farright_user_ids,
                    retweeted_status_user_id %in% farright_user_ids),
    vertices = data.frame(
      id = farright_user_ids[
        farright_user_ids %in% bushfire_tweets.dt$user_id |
          farright_user_ids %in% bushfire_tweets.dt$retweeted_status_user_id
      ]
    )
  )


# Classify the URLs of the treated tweets by domain and keep a single class
# per tweet.
bushfire_urls_class.dt <-
  bushfire_urls.dt %>%
  dplyr::filter(tweet_id %in% bushfire_par_matched_wide.df$tweet_id) %>%
  # Same domain clean-up as applied to `mediabiasfactcheck$domain`; the first
  # matching rule of case_when() wins.
  # NOTE(review): only the short-link host "youtu.be" is classed as YouTube;
  # "youtube.com" falls through to "other domain" - confirm this is intended.
  dplyr::mutate(domain = gsub("http(s)?://|www\\.|/", "", domain),
                class = case_when(domain %in%
                                    australia_news_sources$domain ~
                                    "mainstream media",
                                  domain == "twitter.com" ~
                                    "Twitter.com",
                                  domain == "youtu.be" ~
                                    "YouTube.com",
                                  domain %in%
                                    mediabiasfactcheck$domain[
                                      mediabiasfactcheck$Credibility == 'Low Credibility'] ~
                                    "low credibility media",
                                  TRUE ~ "other domain")) %>%
  # Priority of each class when a tweet carries several URLs (1 = highest).
  dplyr::left_join(data.frame(class = c("low credibility media",
                                        "YouTube.com",
                                        "other domain",
                                        "mainstream media",
                                        "Twitter.com"),
                              order = 1:5),
                   by = "class") %>%
  # Sort by the priority column (not alphabetically by class name) so that
  # distinct() keeps the highest-priority class of each tweet.
  dplyr::arrange(tweet_id, order) %>%
  dplyr::distinct(tweet_id, .keep_all = TRUE)


# Attach the URL class of the treated tweet to each matched pair; pairs whose
# treated tweet has no classified URL are labelled "no URL".
bushfire_par_matched_wide.df <-
  dplyr::left_join(
    bushfire_par_matched_wide.df,
    dplyr::select(bushfire_urls_class.dt, tweet_id, class),
    by = "tweet_id"
  ) %>%
  dplyr::mutate(class = replace(class, is.na(class), "no URL"))

# In-degree on the raw multigraph: number of incoming retweet edges
# (repeated retweets by the same account are counted separately).
V(bushfire_farright_retweet.g)$indegree <-
  igraph::degree(bushfire_farright_retweet.g, mode = "in")

# Give every edge a unit weight before collapsing parallel edges.
E(bushfire_farright_retweet.g)$weight <-
  1

# Collapse multiple edges and drop self-loops.
bushfire_farright_retweet.g <-
  igraph::simplify(bushfire_farright_retweet.g)

# In-degree on the simplified graph: number of distinct accounts with an edge
# into the vertex. The graph is already simplified at this point, so it is
# used directly (an unqualified `simplify()` here could also resolve to
# purrr::simplify, since the tidyverse is attached).
V(bushfire_farright_retweet.g)$indegree_simp <-
  igraph::degree(bushfire_farright_retweet.g, mode = "in")

# Flag edges that have a counterpart in the opposite direction.
E(bushfire_farright_retweet.g)$mutual <-
  igraph::is.mutual(bushfire_farright_retweet.g)

# Degree in the sub-network made of mutual edges only.
# NOTE(review): degree() defaults to mode = "all", so each mutual partner
# contributes two edges (one in, one out) - confirm this is intended.
V(bushfire_farright_retweet.g)$mutual_degree <-
  igraph::degree(bushfire_farright_retweet.g %>%
                   igraph::delete.edges(which(!E(bushfire_farright_retweet.g)$mutual)))


# Copy the three vertex metrics onto the per-account summary; accounts that
# are not vertices of the graph get NA.
bushfire_par_matched_diff_mean.df$rt_indegree <-
  V(bushfire_farright_retweet.g)$indegree[match(bushfire_par_matched_diff_mean.df$user_id,
                                                V(bushfire_farright_retweet.g)$name)]

bushfire_par_matched_diff_mean.df$rt_indegree_simp <-
  V(bushfire_farright_retweet.g)$indegree_simp[match(bushfire_par_matched_diff_mean.df$user_id,
                                                     V(bushfire_farright_retweet.g)$name)]

bushfire_par_matched_diff_mean.df$rt_mutual_degree <-
  V(bushfire_farright_retweet.g)$mutual_degree[match(bushfire_par_matched_diff_mean.df$user_id,
                                                     V(bushfire_farright_retweet.g)$name)]


## Covid

# One row per matched pair (`time`, `subclass`): the `rt_n` of the tweet with
# `treat == 0` and of the tweet with `treat == 1` side by side (columns `0`
# and `1`), their difference `diff`, and the id and author of the
# `treat == 1` tweet.
covid_par_matched_wide.df <-
  tidyr::pivot_wider(
    covid_par_matched.df,
    id_cols = c(time, subclass),
    names_from = "treat",
    values_from = "rt_n"
  ) %>%
  dplyr::mutate(diff = `1` - `0`) %>%
  # Id of the treated tweet of each pair ...
  dplyr::left_join(
    dplyr::select(
      dplyr::filter(covid_par_matched.df, treat == 1),
      time, tweet_id, subclass
    ),
    by = c("time", "subclass")
  ) %>%
  # ... and the account that published it.
  dplyr::left_join(
    dplyr::select(covid_tweets.dt, tweet_id, user_id),
    by = "tweet_id"
  )

# sum(!covid_par_matched_wide.df$user_id %in% farright_user_ids)

# covid_par_matched_wide.df %>%
#   dplyr::group_by(user_id) %>%
#   dplyr::summarise(mean = mean(diff),
#                    median = median(diff),
#                    p05 = quantile(diff, prob = .05),
#                    p95 = quantile(diff, prob = .95),
#                    p25 = quantile(diff, prob = .25),
#                    p75 = quantile(diff, prob = .75)) %>%
#   dplyr::ungroup() %>%
#   dplyr::mutate(user_id = factor(user_id, levels = user_id[order(mean, decreasing = T)]),
#                 class = case_when(p25 <= 0 & p75 >= 0 ~ "Null",
#                                   p25 >= 0 & p75 > 0 ~ "Overperforming",
#                                   p25 < 0 & p75 <= 0 ~ "Underperforming")) %>%
#   ggplot(aes(y = mean, x = user_id)) +
#   geom_errorbar(aes(ymin = p05, ymax = p95)) +
#   geom_point(size = 1, aes(colour = class)) +
#   theme(axis.text.x = element_blank(),
#         axis.ticks.x =  element_blank(),
#         panel.grid.major.x = element_blank()) +
#   coord_cartesian(ylim = c(-50, 50))

# Per-account summary of `diff` (treated minus matched control): mean, median
# and the 5th, 95th, 25th and 75th percentiles.
# `probs` is spelled out in full; the previous `prob =` only worked through
# partial argument matching.
covid_par_matched_diff_mean.df <-
  covid_par_matched_wide.df %>%
  dplyr::group_by(user_id) %>%
  dplyr::summarise(mean = mean(diff),
                   median = median(diff),
                   p05 = quantile(diff, probs = .05),
                   p95 = quantile(diff, probs = .95),
                   p25 = quantile(diff, probs = .25),
                   p75 = quantile(diff, probs = .75))

# Questions: Overperformance explained by centrality in far-right mutual retweet network.  YES

# Retweet network among far-right accounts: one edge per retweet, pointing
# from the retweeting account (`user_id`) to the retweeted account
# (`retweeted_status_user_id`), restricted to pairs where both accounts are in
# `farright_user_ids`. The vertex set is every far-right account that appears
# in the data set either as an author or as a retweeted account, so accounts
# without any edge are kept as isolated vertices.
covid_farright_retweet.g <-
  igraph::graph_from_data_frame(
    d = covid_tweets.dt %>%
      dplyr::select(user_id, retweeted_status_user_id) %>%
      dplyr::filter(!is.na(retweeted_status_user_id),
                    user_id %in% farright_user_ids,
                    retweeted_status_user_id %in% farright_user_ids),
    vertices = data.frame(
      id = farright_user_ids[
        farright_user_ids %in% covid_tweets.dt$user_id |
          farright_user_ids %in% covid_tweets.dt$retweeted_status_user_id
      ]
    )
  )


# Classify the URLs of the treated tweets by domain and keep a single class
# per tweet.
covid_urls_class.dt <-
  covid_urls.dt %>%
  dplyr::filter(tweet_id %in% covid_par_matched_wide.df$tweet_id) %>%
  # Same domain clean-up as applied to `mediabiasfactcheck$domain`; the first
  # matching rule of case_when() wins.
  # NOTE(review): only the short-link host "youtu.be" is classed as YouTube;
  # "youtube.com" falls through to "other domain" - confirm this is intended.
  dplyr::mutate(domain = gsub("http(s)?://|www\\.|/", "", domain),
                class = case_when(domain %in%
                                    australia_news_sources$domain ~
                                    "mainstream media",
                                  domain == "twitter.com" ~
                                    "Twitter.com",
                                  domain == "youtu.be" ~
                                    "YouTube.com",
                                  domain %in%
                                    mediabiasfactcheck$domain[
                                      mediabiasfactcheck$Credibility == 'Low Credibility'] ~
                                    "low credibility media",
                                  TRUE ~ "other domain")) %>%
  # Priority of each class when a tweet carries several URLs (1 = highest).
  dplyr::left_join(data.frame(class = c("low credibility media",
                                        "YouTube.com",
                                        "other domain",
                                        "mainstream media",
                                        "Twitter.com"),
                              order = 1:5),
                   by = "class") %>%
  # Sort by the priority column (not alphabetically by class name) so that
  # distinct() keeps the highest-priority class of each tweet.
  dplyr::arrange(tweet_id, order) %>%
  dplyr::distinct(tweet_id, .keep_all = TRUE)


# Attach the URL class of the treated tweet to each matched pair; pairs whose
# treated tweet has no classified URL are labelled "no URL".
covid_par_matched_wide.df <-
  dplyr::left_join(
    covid_par_matched_wide.df,
    dplyr::select(covid_urls_class.dt, tweet_id, class),
    by = "tweet_id"
  ) %>%
  dplyr::mutate(class = replace(class, is.na(class), "no URL"))

# In-degree on the raw multigraph: number of incoming retweet edges
# (repeated retweets by the same account are counted separately).
V(covid_farright_retweet.g)$indegree <-
  igraph::degree(covid_farright_retweet.g, mode = "in")

# Give every edge a unit weight before collapsing parallel edges.
E(covid_farright_retweet.g)$weight <-
  1

# Collapse multiple edges and drop self-loops.
covid_farright_retweet.g <-
  igraph::simplify(covid_farright_retweet.g)

# In-degree on the simplified graph: number of distinct accounts with an edge
# into the vertex. The graph is already simplified at this point, so it is
# used directly (an unqualified `simplify()` here could also resolve to
# purrr::simplify, since the tidyverse is attached).
V(covid_farright_retweet.g)$indegree_simp <-
  igraph::degree(covid_farright_retweet.g, mode = "in")

# Flag edges that have a counterpart in the opposite direction.
E(covid_farright_retweet.g)$mutual <-
  igraph::is.mutual(covid_farright_retweet.g)

# Degree in the sub-network made of mutual edges only.
# NOTE(review): degree() defaults to mode = "all", so each mutual partner
# contributes two edges (one in, one out) - confirm this is intended.
V(covid_farright_retweet.g)$mutual_degree <-
  igraph::degree(covid_farright_retweet.g %>%
                   igraph::delete.edges(which(!E(covid_farright_retweet.g)$mutual)))


# Copy the three vertex metrics onto the per-account summary; accounts that
# are not vertices of the graph get NA.
covid_par_matched_diff_mean.df$rt_indegree <-
  V(covid_farright_retweet.g)$indegree[match(covid_par_matched_diff_mean.df$user_id,
                                             V(covid_farright_retweet.g)$name)]

covid_par_matched_diff_mean.df$rt_indegree_simp <-
  V(covid_farright_retweet.g)$indegree_simp[match(covid_par_matched_diff_mean.df$user_id,
                                                  V(covid_farright_retweet.g)$name)]

covid_par_matched_diff_mean.df$rt_mutual_degree <-
  V(covid_farright_retweet.g)$mutual_degree[match(covid_par_matched_diff_mean.df$user_id,
                                                  V(covid_farright_retweet.g)$name)]

# Box plot (with jittered points) of the per-account mean RT difference,
# grouped by one of the network degree columns.
#
# diff_mean:  per-account summary with a `mean` column and the degree column.
# degree_col: name of the degree column to put on the x axis (string).
# x_label:    x-axis title.
# Returns a ggplot object.
plot_rt_diff_by_degree <- function(diff_mean, degree_col, x_label) {
  ggplot(diff_mean, aes(x = factor(.data[[degree_col]]), y = mean)) +
    geom_hline(yintercept = 0) +
    geom_boxplot() +
    geom_jitter(alpha = .5, shape = 1) +
    scale_y_continuous(trans = "pseudo_log") +
    labs(x = x_label, y = "mean RT difference from matched") +
    theme_bw()
}

# Column of three plots for one case study: raw in-degree, in-degree on the
# simplified network, and degree in the mutual retweet network.
plot_degree_column <- function(diff_mean) {
  cowplot::plot_grid(
    plot_rt_diff_by_degree(diff_mean, "rt_indegree",
                           "far-right RT network: indegree"),
    plot_rt_diff_by_degree(diff_mean, "rt_indegree_simp",
                           "far-right RT network (simplified): indegree"),
    plot_rt_diff_by_degree(diff_mean, "rt_mutual_degree",
                           "far-right mutual RT network: degree"),
    ncol = 1)
}

cowplot::plot_grid(
  plot_degree_column(bushfire_par_matched_diff_mean.df),
  plot_degree_column(covid_par_matched_diff_mean.df),
  ncol = 2, labels = c("Bushfires", "Covid-19"))

```

\clearpage


